artists = read.csv("Artists.csv")
events = read.csv("Events.csv")
linkartists = read.csv("LinkArtists.csv")
# converting data into propper format
events = events %>% mutate(Timestamp = ymd_hms(events$Timestamp),
Year = year(Timestamp),
Month = month(Timestamp),
Day = day(Timestamp),
Hour = hour(Timestamp),
Quarter = quarter(Timestamp)
)
# modifying column names to match
colnames(linkartists)[which(names(linkartists)=="X.ArtistId")] ="ArtistId"
colnames(events)[which(names(events)=="Linkid")] ="LinkId"
#combining tables
linkartists = linkartists %>% left_join(artists)
## Joining, by = "ArtistId"
#switching LinkId to lowercase to match events data
linkartists = linkartists %>% mutate(LinkId = tolower(LinkId))
#joining linkartists to match LinkId with Artist
events = events %>% left_join(linkartists)
## Joining, by = "LinkId"
## Warning: Column `LinkId` joining factor and character vector, coercing into
## character vector
Looking at most frequent or most popular values in each field
# Summary of browsers used in descending order
sort(summary(events$Browser),decreasing=T)
## Instagram App Safari Facebook App
## 8117 7031 1822
## Chrome Mobile Safari UIWebView Line App
## 773 546 327
## Google App Snapchat App Firefox
## 182 165 143
## Weibo Twitter App Facebook Messenger
## 106 86 62
## Edge Samsung Browser Generic Ruby Crawler
## 23 16 7
## UC Browser WeChat App Firefox for iOS
## 4 4 3
## Opera Yandex Browser Apple Mail
## 3 3 2
## Coc Coc Browser Puffin Whale Browser
## 2 2 2
## Android WebView CM Browser Iron
## 1 1 1
## Mint Browser Opera Mini
## 1 1
# Top 5 Cities
head(sort(summary(events$City),decreasing=T))
## (Other) Unknown Lagos Taipei Tokyo Osaka
## 11432 1847 441 259 215 193
# Top 5 countries
head(sort(summary(events$Countrycode),decreasing=T))
## US JP GB RU FR CA
## 5621 4603 819 665 596 592
# Table of Member Types
sort(summary(events$Convtype),decreasing=T)
## trial membership
## 11189 8247
# Table of Member Plans
sort(summary(events$Convname),decreasing=T)
## Individual plan Student plan Family plan
## 17686 1165 585
# Table of Devices
sort(summary(events$Device),decreasing=T)
## mobile desktop tablet Mobile Phone Mobile Device
## 17839 956 287 251 79
## Desktop crawler Tablet
## 13 7 4
Method I wanted to looking at Linkfire visitors who had visited at least twice in order to see whether or not they were new subscribers or winback customers.
I assumed that visitors who were once on trial and then began memberships became new subscribers.
For Winback customers, I looked for users who were members before going on a trial period and then back to being members. I also counted the number of times a single user resubscribed to their streaming service if they switched between trial and membership services multiple times.
repeated_id = events$Visitortoken[duplicated(events$Visitortoken)]
# All users that have visited Linkfire more than once
repeated_id = unique(repeated_id)
events_repeat = events[events$Visitortoken %in% repeated_id,]
# Most frequent users
head(sort(summary(events$Visitortoken),decreasing = T))
## (Other) c4c24e5ead78d4a3a7edxxxxxxxxxxxx
## 18449 46
## 1c0445de0b6266964690xxxxxxxxxxxx df30982ac360fad69fcaxxxxxxxxxxxx
## 26 22
## a50bd11b4c71fb0a5e32xxxxxxxxxxxx 472c7c3fecfc63152692xxxxxxxxxxxx
## 17 16
events_repeat = events_repeat[order(events_repeat$Visitortoken,events_repeat$Timestamp),]
events_repeat$new_sub = rep(0, nrow(events_repeat))
events_repeat$win_back = rep(0,nrow(events_repeat))
new_id = 0
lost = 0
was_member = 0
on_trial = 0
for(i in 1:nrow(events_repeat)) {
# Checks if looking at new user
if(new_id == 1 && events_repeat$Visitortoken[i-1]!=events_repeat$Visitortoken[i]) {
new_id = 0
lost = 0
was_member = 0
on_trial = 0
}
# Assigns values if this is a user's first visit
if(new_id == 0) {
new_id=1
if(events_repeat$Convtype[i]=="trial") {on_trial=1;
} else if(events_repeat$Convtype[i]=="membership") {was_member=1; }
next
}
# Checks to see if user becomes new subscriber or winback
if(new_id == 1) {
if(events_repeat$Convtype[i]=="trial" && on_trial==1 && lost == 0) {
} else if(events_repeat$Convtype[i] == "membership" && was_member == 1 && lost == 0) { on_trial = 0; was_member=1;
} else if(events_repeat$Convtype[i] == "trial" && was_member == 1) {lost=1; on_trial= 0;
} else if(events_repeat$Convtype[i] == "membership" && lost == 1 && on_trial==0 && was_member==1) {events_repeat$win_back[i]=1; lost=0;
} else if(events_repeat$Convtype[i] == "membership" && on_trial==1 && lost == 0) {events_repeat$new_sub[i]=1;on_trial=0;was_member=1;}
}
}
# Number new subscribed members
sum(events_repeat$new_sub)
## [1] 2480
# Instances of customers returning to their subscriptions
sum(events_repeat$win_back)
## [1] 8
# Number of unique visitors total
length(unique(events$Visitortoken))
## [1] 14507
# 17% of Linkfire visitors who become new subscribers
2480/14507 * 100
## [1] 17.0952
# .05% of Linkfire visitors who resubscribe
8/14507 * 100
## [1] 0.05514579
# Number of unique linkfire visitors in 2019
length(unique(events$Visitortoken[which(events$Year==2019,)]))
## [1] 6382
# Number of unique linkfire visitors in 2020
length(unique(events$Visitortoken[which(events$Year==2020,)]))
## [1] 9811
# Percent change in Linkfire visitors from 2019 to 2020
((9811-6382)/6382)*100
## [1] 53.72924
Comparing different fields by the percentage of new subscribers or Winbacks
# City with highest percentage of new subscribers
city_new_sub = aggregate(events_repeat$new_sub/2480*100~events_repeat$City, data = events_repeat, FUN = sum)
city_new_sub = city_new_sub %>% arrange(desc(city_new_sub[,2]))
head(city_new_sub)
## events_repeat$City events_repeat$new_sub/2480 * 100
## 1 Unknown 11.8548387
## 2 Taipei 2.1370968
## 3 Moscow 1.3709677
## 4 Lagos 1.2096774
## 5 Tokyo 1.0483871
## 6 Osaka 0.9677419
# City with highest percentage of winbacks
city_win_back = aggregate(events_repeat$win_back/8*100~events_repeat$City, data = events_repeat, FUN = sum)
city_win_back = city_win_back %>% arrange(desc(city_win_back[,2]))
head(city_win_back)
## events_repeat$City events_repeat$win_back/8 * 100
## 1 Bridgeport 50.0
## 2 Roppongi 25.0
## 3 East Hanover 12.5
## 4 Moscow 12.5
## 5 0.0
## 6 Aabenraa 0.0
# Countries with highest percentage of new subscribers
country_new_sub = aggregate(events_repeat$new_sub/2480*100~events_repeat$Countrycode, data = events_repeat, FUN = sum)
country_new_sub = country_new_sub %>% arrange(desc(country_new_sub[,2]))
head(country_new_sub)
## events_repeat$Countrycode events_repeat$new_sub/2480 * 100
## 1 JP 28.387097
## 2 US 27.459677
## 3 RU 4.475806
## 4 GB 4.435484
## 5 TW 3.548387
## 6 FR 3.508065
# Countries with highest percentage of winbacks
country_win_back = aggregate(events_repeat$win_back/8*100~events_repeat$Countrycode, data = events_repeat, FUN = sum)
country_win_back = country_win_back %>% arrange(desc(country_win_back[,2]))
head(country_win_back)
## events_repeat$Countrycode events_repeat$win_back/8 * 100
## 1 US 62.5
## 2 JP 25.0
## 3 RU 12.5
## 4 AE 0.0
## 5 AR 0.0
## 6 AT 0.0
# Subscription Plans and percentage of new subscribers
plan_new_sub = aggregate(events_repeat$new_sub/2480*100~events_repeat$Convname,FUN = sum)
plan_new_sub = plan_new_sub %>% arrange(desc(plan_new_sub[,2]))
head(plan_new_sub)
## events_repeat$Convname events_repeat$new_sub/2480 * 100
## 1 Individual plan 88.104839
## 2 Student plan 8.024194
## 3 Family plan 3.870968
# Subscription Plans and percentage of winbacks
plan_win_back = aggregate(events_repeat$win_back/8*100~events_repeat$Convname,FUN = sum)
plan_win_back = plan_win_back %>% arrange(desc(plan_win_back[,2]))
head(plan_win_back)
## events_repeat$Convname events_repeat$win_back/8 * 100
## 1 Individual plan 100
## 2 Family plan 0
## 3 Student plan 0
# Devices and percentage of new customers
device_new_sub = aggregate(events_repeat$new_sub/2480*100~events_repeat$Device,FUN = sum)
device_new_sub = device_new_sub %>% arrange(desc(device_new_sub[,2]))
device_new_sub
## events_repeat$Device events_repeat$new_sub/2480 * 100
## 1 mobile 91.81451613
## 2 desktop 5.04032258
## 3 tablet 1.77419355
## 4 Mobile Device 1.20967742
## 5 Mobile Phone 0.08064516
## 6 crawler 0.04032258
## 7 Desktop 0.04032258
## 8 Tablet 0.00000000
# Devices and percentage of winback
device_win_back = aggregate(events_repeat$win_back/8*100~events_repeat$Device,FUN = sum)
device_win_back = device_win_back %>% arrange(desc(device_win_back[,2]))
device_win_back
## events_repeat$Device events_repeat$win_back/8 * 100
## 1 desktop 87.5
## 2 mobile 12.5
## 3 crawler 0.0
## 4 Desktop 0.0
## 5 Mobile Device 0.0
## 6 Mobile Phone 0.0
## 7 tablet 0.0
## 8 Tablet 0.0
# Years and percentage of new subscribers
year_new_sub = aggregate(events_repeat$new_sub/2480*100~events_repeat$Year,FUN = sum)
year_new_sub = year_new_sub %>% arrange(desc(year_new_sub[,2]))
year_new_sub
## events_repeat$Year events_repeat$new_sub/2480 * 100
## 1 2020 78.10484
## 2 2019 21.89516
# Years and percentage of winback
year_win_back = aggregate(events_repeat$win_back/8*100~events_repeat$Year,FUN = sum)
year_win_back = year_win_back %>% arrange(desc(year_win_back[,2]))
year_win_back
## events_repeat$Year events_repeat$win_back/8 * 100
## 1 2020 75
## 2 2019 25
# Months and percentage of new subscribers
month_new_sub = aggregate(events_repeat$new_sub/2480*100~events_repeat$Month,FUN = sum)
month_new_sub = month_new_sub %>% arrange((month_new_sub[,1]))
month_new_sub
## events_repeat$Month events_repeat$new_sub/2480 * 100
## 1 1 26.97580645
## 2 2 15.20161290
## 3 3 26.00806452
## 4 4 9.91935484
## 5 9 0.08064516
## 6 10 2.09677419
## 7 11 0.28225806
## 8 12 19.43548387
# Months and percentage of winback
month_win_back = aggregate(events_repeat$win_back/8*100~events_repeat$Month,FUN = sum)
month_win_back = month_win_back %>% arrange((month_win_back[,1]))
month_win_back
## events_repeat$Month events_repeat$win_back/8 * 100
## 1 1 37.5
## 2 2 12.5
## 3 3 12.5
## 4 4 12.5
## 5 9 0.0
## 6 10 0.0
## 7 11 0.0
## 8 12 25.0
# Artists with the highest percentage of new subscribers
artist_new_sub = aggregate(events_repeat$new_sub/2480*100~events_repeat$Name,FUN = sum)
artist_new_sub = artist_new_sub %>% arrange(desc(artist_new_sub[,2]))
artist_new_sub[1:10,]
## events_repeat$Name events_repeat$new_sub/2480 * 100
## 1 Jackson Wang 3.5080645
## 2 Sunday Service Choir 2.5806452
## 3 yui (FLOWER FLOWER) × ミゾベリョウ (odol) 0.9274194
## 4 Various Artists 0.8467742
## 5 MEZZO” 0.7661290
## 6 88rising 0.7258065
## 7 Rich Brian 0.7258065
## 8 Yoga Lin 0.7258065
## 9 Phương Anh Đào 0.6854839
## 10 Tayc 0.6854839
# Artists with the highest percentage of winback
artist_win_back = aggregate(events_repeat$win_back/8*100~events_repeat$Name,FUN = sum)
artist_win_back = artist_win_back %>% arrange(desc(artist_win_back[,2]))
artist_win_back[1:10,]
## events_repeat$Name events_repeat$win_back/8 * 100
## 1 Eye on Eyez 50.0
## 2 Camilo 12.5
## 3 Sunday Service Choir 12.5
## 4 0.0
## 5 .mt 0.0
## 6 [Alexandros] 0.0
## 7 04 Limited Sazabys 0.0
## 8 1070725 0.0
## 9 2Baba 0.0
## 10 2live 0.0
year_new_sub = aggregate(events_repeat$new_sub~events_repeat$Year,FUN = sum)
# Percentage change in number of new subscribers who used Linkfire from 2019 to 2020
(1937-543)/543*100
## [1] 256.7219
year_win_back = aggregate(events_repeat$win_back~events_repeat$Year,FUN = sum)
# Percentage change in number of winbacks who used Linkfire from 2019 to 2020
(6-2)/2*100
## [1] 200
(hover over the bars to reveal more info)
g = ggplot(device_new_sub, aes(x=device_new_sub[,1],y=device_new_sub[,2])) + geom_bar(stat="identity") + xlab("Devices") + ylab("Number of New Subscribers") +ggtitle("Devices used by New Subscribers")
ggplotly(g) # people who are new subscribers are on phones
g = ggplot(device_win_back, aes(x=device_win_back[,1],y=device_win_back[,2])) + geom_bar(stat="identity") + xlab("Devices") + ylab("Number of Winbacks") +ggtitle("Devices used by Winbacks")
ggplotly(g) # people who are won back are on desktops
g = ggplot(month_new_sub, aes(x=month_new_sub[,1],y=month_new_sub[,2])) + geom_bar(stat="identity") + xlab("Months") + ylab("Number of New Subscribers") +ggtitle("Monthly count of New Subscribers")
ggplotly(g) # there is an increase
g = ggplot(month_win_back, aes(x=month_win_back[,1],y=month_win_back[,2])) + geom_bar(stat="identity") + xlab("Months") + ylab("Number of Winbacks") +ggtitle("Monthly count of Winbacks")
ggplotly(g)
g = ggplot(year_new_sub, aes(x=year_new_sub[,1],y=year_new_sub[,2])) + geom_bar(stat="identity") + xlab("Year") + ylab("Number of New Subscribers") +ggtitle("Yearly count of New Subscribers")
ggplotly(g)
g = ggplot(year_win_back, aes(x=year_win_back[,1],y=year_win_back[,2])) + geom_bar(stat="identity") + xlab("Year") + ylab("Number of Winbacks") +ggtitle("Yearly count of Winbacks")
ggplotly(g)
g = ggplot(artist_new_sub[1:10,], aes(x=artist_new_sub[1:10,1],y=artist_new_sub[1:10,2])) + geom_bar(stat="identity") + xlab("Artists") + ylab("Number of New Subscribers") +ggtitle("Artists with most New Subscribers")
ggplotly(g)
g = ggplot(artist_win_back[1:3,], aes(x=artist_win_back[1:3,1],y=artist_win_back[1:3,2])) + geom_bar(stat="identity") + xlab("Artists") + ylab("Number of Winbacks") +ggtitle("Artists with most Winbacks")
ggplotly(g)